Based on the Kaggle predictive-maintenance dataset:
https://www.kaggle.com/ludobenistant/predictive-maintenance-1/data
A step-by-step approach with pandas DataFrames, followed by a high-level view of Dask possibilities, and finally a neural-network look at the data.
In [1]:
# Import Libraries needed
import pandas as pd #dataframe manipulation
import numpy as np #numerical processing of vectors
import matplotlib.pyplot as plt #plotting
%matplotlib inline
#import tensorflow as tf
import sklearn
from sklearn import tree
import graphviz
import dask
print("Pandas:\t\t", pd.__version__)
print("Numpy:\t\t", np.__version__)
#print("Tensorflow:\t", tf.__version__)
print("Dask:\t\t", dask.__version__)
print("Scikit-learn:\t", sklearn.__version__)
# Load the raw maintenance data; keep an untouched copy (df_init) so the
# original frame survives any later modifications of df.
# BUG FIX: the original `df_init = df = pd.read_csv(...)` bound BOTH names
# to the SAME object, so df_init offered no protection against mutation.
df = pd.read_csv('./maintenance_data.csv')
df_init = df.copy()
In [3]:
# List the column names of the loaded dataset
df.columns
Out[3]:
In [4]:
# Preview the first five rows
df.head()
Out[4]:
In [5]:
# Summary statistics for the numeric columns
df.describe()
Out[5]:
In [6]:
# Machines with the shortest lifetimes (ascending sort is the default)
df.sort_values('lifetime').head()
Out[6]:
In [7]:
# Machines with the longest lifetimes (ascending sort is the default)
df.sort_values('lifetime').tail()
Out[7]:
In [8]:
# Bar plot of lifetime per team.
# BUG FIX: the original sorted `team` and `lifetime` INDEPENDENTLY
# (plt.bar(df.sort_values('team').team, df.sort_values('lifetime').lifetime)),
# pairing each team label with a lifetime taken from a different row.
# Sort the frame once so every bar keeps its own (team, lifetime) pair.
df_by_team = df.sort_values('team')
plt.bar(df_by_team.team, df_by_team.lifetime)
Out[8]:
In [9]:
# Row counts per (team, broken) combination.
# NOTE(review): grouping is done by the Series objects themselves rather than
# by column names -- presumably equivalent to groupby(['team', 'broken'])
# here, but the count() output columns may differ; verify if this matters.
df.groupby([df.team, df.broken]).count()
Out[9]:
In [10]:
# Share of each (team, broken) combination as a percentage of all rows
(
    df.groupby(['team', 'broken'])
      .agg({'broken': 'count'})
      .apply(lambda col: 100 * col / float(col.sum()))
)
Out[10]:
In [11]:
# Same percentages as the previous cell, via a named intermediate
team_status_counts = df.groupby(['team', 'broken']).agg({'broken': 'count'})
team_status_counts.apply(lambda col: 100 * col / float(col.sum()))
Out[11]:
In [12]:
# Percentage of all failures (broken == 1) attributable to each provider
group_col = 'provider'
provider_failures = df.loc[df['broken'] == 1].groupby([group_col]).agg({'broken': 'count'})
provider_failures.apply(lambda col: round(100 * col / float(col.sum()), 2)).rename(columns={"broken": "%"})
Out[12]:
Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.
http://scikit-learn.org/stable/modules/tree.html
In [13]:
# Build the feature matrix / target vector for the decision tree
tree_data = df_init.drop(columns=['broken'])
tree_target = df_init.broken

# Encode the categorical columns as integers via a single mapping.
# BUG FIX: the original issued one replace(..., inplace=True) per value
# inside a bare `try/except: pass`, silently swallowing any error.
# (A proper pipeline would use sklearn's OrdinalEncoder/OneHotEncoder.)
category_codes = {
    'TeamA': 1, 'TeamB': 2, 'TeamC': 3,
    'Provider1': 1, 'Provider2': 2, 'Provider3': 3, 'Provider4': 4,
}
tree_data = tree_data.replace(category_codes)

# Convert dataframes to plain numpy arrays for scikit-learn
tree_data = tree_data.values
tree_target = tree_target.values

# Column names - feature labels (must match the column order of tree_data)
tree_feature_names = ['lifetime', 'pressureInd', 'moistureInd', 'temperatureInd', 'team', 'provider']
# Target names - class labels. export_graphviz expects these in ASCENDING
# class order: class 0 -> operational, class 1 -> broken.
# BUG FIX: the original list was reversed, mislabelling the rendered tree.
tree_target_names = ['Operational', 'BROKEN!']

# Fit the tree; random_state pinned so the fitted tree is reproducible
tree_clf = tree.DecisionTreeClassifier(random_state=0)
#tree_clf.set_params(max_depth=2)
tree_clf = tree_clf.fit(tree_data, tree_target)
tree_clf.get_params()
Out[13]:
In [14]:
#output graph tree
# Render the fitted tree with graphviz and save the image to disk
dot_source = tree.export_graphviz(
    tree_clf,
    out_file=None,
    feature_names=tree_feature_names,
    class_names=tree_target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_source)
graph.render("Maintenance_classification_tree")
# Display the tree inline (last expression of the cell)
graph
Out[14]:
In [15]:
# Feature column order after dropping the target -- this is the order the
# prediction vectors in the next cell must follow
df_init.drop(columns=['broken']).columns
Out[15]:
In [16]:
#PREDICTION WITHOUT REGRESSION - 1-->BROKEN 0-->Operational
# Feature order: lifetime, pressureInd, moistureInd, temperatureInd, team, provider
sample_1 = [[70., 100., 100., 100., 1., 3.]]
sample_2 = [[70., 100., 100., 100., 1., 1.]]
print('instance 1 prediction: ', tree_clf.predict(sample_1))
print('instance 2 prediction: ', tree_clf.predict(sample_2))
In [ ]: